Versions

V19 - updated to survey_data.2020-07-08_17.17.10.txt, then survey_data.2020-07-14_11.31.11.txt

V18 - updated to survey_data.2020-07-02_11.01.36.txt

V17 - Change sample to dataset; change project to cohort

V16 - use namer on chunks

V15 - change correlation plots to small dots

V14 - combine plots

V13 - change gene count threshold from categporical to numeric

V12 - exclude datasets with fewer than 100 measured genes and use pre-generated survey_data

V11 - color consistency

V10 - calculate non genic and all exonic counts for correlations analysis

V9 - add gene count; import readdist.txt for more nuanced comparison of cor with gene count

Overview

reference range for unmapped reads is calculated based on unmapped reads/total

reference range for duplicate reads is calculated based on duplicate reads/mapped reads where mapped reads = total mapped and multi-mapped (<20x) reads)

reference range for non-exonic reads is calculated based on non-exonic/non duplicate reads) where non duplicate reads = exonic and non-exonic non dupe reads)

The composition fractions are not dependent on higher level ones

Libraries

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(viridis)
## Loading required package: viridisLite
library(knitr)
library(ggthemes) # for theme_linedraw() 
library(ggpubr) # for stat_cor
library(cowplot) # for ggdraw and draw_label
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library(RColorBrewer)
library(pander)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Config

inputs_dir <- "inputs"
data_file <- tibble(files = list.files(inputs_dir)) %>%
  arrange(desc(files)) %>%
  top_n(1, wt = files) %>%
  pull(files)

print(data_file)
## [1] "survey_data.2020-07-14_12.13.18.txt"

Settings

min_genes_gt_0 <- 100

Functions

is_outlier <- function(x) {
  (x > quantile(x, 0.75) + 1.5*IQR(x)) |
    (x < quantile(x, 0.25) - 1.5*IQR(x))
}


most_common_val_w_percent <- function(value_vector = c("a", "a", "b")){
  mc_vals <- tabyl(value_vector) %>%
    arrange(desc(n)) %>%
    top_n(1, n) 
  paste0(round(100*mc_vals$percent), "% ", mc_vals$value_vector)
}

# StatBin2 allows depiction of empty bins as blank instead of a horizontal line:
# https://stackoverflow.com/questions/57128090/remove-baseline-color-for-geom-histogram
StatBin2 <- ggproto(
  "StatBin2", 
  StatBin,
  compute_group = function (data, scales, binwidth = NULL, bins = NULL, 
                            center = NULL, boundary = NULL, 
                            closed = c("right", "left"), pad = FALSE, 
                            breaks = NULL, origin = NULL, right = NULL, 
                            drop = NULL, width = NULL) {
    if (!is.null(breaks)) {
      if (!scales$x$is_discrete()) {
        breaks <- scales$x$transform(breaks)
      }
      bins <- ggplot2:::bin_breaks(breaks, closed)
    }
    else if (!is.null(binwidth)) {
      if (is.function(binwidth)) {
        binwidth <- binwidth(data$x)
      }
      bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth, 
                                         center = center, boundary = boundary, 
                                         closed = closed)
    }
    else {
      bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins, 
                                        center = center, boundary = boundary, 
                                        closed = closed)
    }
    res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)

    # drop 0-count bins completely before returning the dataframe
    res <- res[res$count > 0, ] 

    res
  })

Import data

col_spec <- cols(
  .default = col_double(),
  dataset_id = col_character(),
  disease = col_character(),
  dataset_prefix = col_character(),
  pedaya = col_character(),
  gender = col_character(),
  doi_link = col_character(),
  source_accession = col_character(),
  repo = col_character(),
  original_dataset_id = col_character(),
  cohort = col_character()
)

counts_and_meta <- read_tsv(file.path(inputs_dir, data_file), col_types = col_spec) %>%
  group_by(cohort) %>%
  mutate(n_in_cohort = n())

Colors

# brewer.pal(8, "Paired")

these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
"#FDBF6F", "#FF7F00")

# light blue, dark blue, etc: green, red, orange (then purple and brown)

these_colors <- c("#1F78B4", "#1F78B4", "#33A02C", "#33A02C", "#E31A1C", "#E31A1C", 
"#FF7F00", "#FF7F00")

names(these_colors) <-  c("not assigned", "Total reads", "Non-exonic reads",  "Exonic reads", "Duplicate reads",  "Non-duplicate reads",  "Unmapped reads", "Mapped reads")

these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")

Datasets analyzed

nrow(counts_and_meta)
## [1] 2177

Diseases in dataset

counts_and_meta %>% tabyl(disease)
##                                     disease   n      percent
##                acute lymphoblastic leukemia 680 0.3123564538
##             acute megakaryoblastic leukemia 103 0.0473128158
##                      acute myeloid leukemia 221 0.1015158475
##             acute undifferentiated leukemia   1 0.0004593477
##                      adrenocortical adenoma   1 0.0004593477
##                       adrenocortical cancer   2 0.0009186955
##                    adrenocortical carcinoma  19 0.0087276068
##                   alveolar rhabdomyosarcoma  40 0.0183739090
##                          cholangiocarcinoma   1 0.0004593477
##                    choroid plexus carcinoma  25 0.0114836932
##         desmoplastic small round cell tumor   4 0.0018373909
##      dysembryoplastic neuroepithelial tumor  13 0.0059715204
##                  embryonal rhabdomyosarcoma  42 0.0192926045
##  embryonal tumor with multilayered rosettes   1 0.0004593477
##                                  ependymoma  97 0.0445567294
##                         epithelioid sarcoma   1 0.0004593477
##                               Ewing sarcoma  70 0.0321543408
##      fibrolamellar hepatocellular carcinoma   3 0.0013780432
##                                fibromatosis   1 0.0004593477
##              gastrointestinal stromal tumor   4 0.0018373909
##                             germ cell tumor   1 0.0004593477
##                     glioblastoma multiforme  29 0.0133210841
##                                      glioma 193 0.0886541112
##                              hepatoblastoma   1 0.0004593477
##                    hepatocellular carcinoma   3 0.0013780432
##                 hypereosinophillic syndrome   1 0.0004593477
##            juvenile myelomonocytic leukemia   1 0.0004593477
##                                    leukemia   1 0.0004593477
##                                    lymphoma  49 0.0225080386
##                             medulloblastoma 201 0.0923288930
##                                    melanoma   7 0.0032154341
##             melanotic neuroectodermal tumor   1 0.0004593477
##                                  meningioma   1 0.0004593477
##                             myofibromatosis   2 0.0009186955
##                    nasopharyngeal carcinoma   2 0.0009186955
##                               neuroblastoma  12 0.0055121727
##                    neuroendocrine carcinoma   1 0.0004593477
##                                not reported   2 0.0009186955
##                not reported - QC fail by PI   1 0.0004593477
##                                osteosarcoma 157 0.0721175930
##           ovarian serous cystadenocarcinoma   1 0.0004593477
##                                      PEComa   1 0.0004593477
##                    pleuropulmonary blastoma   1 0.0004593477
##                              retinoblastoma   2 0.0009186955
##                              rhabdoid tumor  65 0.0298576022
##                            rhabdomyosarcoma  53 0.0243454295
##    spindle cell/sclerosing rhabdomyosarcoma   3 0.0013780432
##          supratentorial embryonal tumor NOS  18 0.0082682591
##                            synovial sarcoma  22 0.0101056500
##                undifferentiated sarcoma NOS   3 0.0013780432
##       undifferentiated spindle cell sarcoma   2 0.0009186955
##                                 wilms tumor  11 0.0050528250
disease_freq_table <- counts_and_meta$disease %>%
  str_to_sentence %>%
  str_replace(" nos$", " NOS") %>%
  as.factor %>%
  fct_infreq %>%
  fct_lump(prop=0.01) %>%
  tabyl (var1 = "Disease") %>%
  adorn_pct_formatting(digits = 1)

colnames(disease_freq_table)[1]="Disease"

write_tsv(disease_freq_table, "results/disease_freq_table.txt")

disease_freq_table  %>%
  kable %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Disease n percent
Acute lymphoblastic leukemia 680 31.2%
Acute myeloid leukemia 221 10.2%
Medulloblastoma 201 9.2%
Glioma 193 8.9%
Osteosarcoma 157 7.2%
Acute megakaryoblastic leukemia 103 4.7%
Ependymoma 97 4.5%
Ewing sarcoma 70 3.2%
Rhabdoid tumor 65 3.0%
Rhabdomyosarcoma 53 2.4%
Lymphoma 49 2.3%
Embryonal rhabdomyosarcoma 42 1.9%
Alveolar rhabdomyosarcoma 40 1.8%
Glioblastoma multiforme 29 1.3%
Choroid plexus carcinoma 25 1.1%
Synovial sarcoma 22 1.0%
Other 130 6.0%
# many are leukemias
table(str_detect(counts_and_meta$disease, "leukemia"))
## 
## FALSE  TRUE 
##  1170  1007

Cohorts in dataset

library(ggthemes)

old_cohort_size_histogram <- ggplot(counts_and_meta) + 
  geom_histogram(aes(n_in_cohort, group=cohort), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
  theme_minimal(base_size = 20) +
  theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +  xlab("Cohort size") +
  ylab("Cohorts") + 
  theme(legend.position="none")

cohort_size_histogram <- ggplot(counts_and_meta %>%
  select(cohort, n_in_cohort) %>%
  distinct ) + 
  geom_histogram(aes(n_in_cohort), 
                 fill = "lightgrey", color = "black", stat = StatBin2,
                 breaks = seq(0,500, by=10)) +
  theme_minimal(base_size = 20) +
  theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +  
  xlab("Cohort size") +
  scale_y_continuous("Cohorts", breaks = seq(0, 8, 2)) + 
  theme(legend.position="none")

cohort_size_histogram

# for cohorts that are more than 75% one disease, color by that disease

n_distinct(counts_and_meta$cohort)
## [1] 41
sort(unique(counts_and_meta$n_in_cohort))
##  [1]   3   4   6   7   8  10  14  15  17  19  22  23  24  25  31  35  36  39  43
## [20]  52  62  63  65  74  75  77  84  88  89 116 127 137 228 337
counts_and_meta %>%
  select(cohort, n_in_cohort) %>%
  distinct %>%
  arrange(desc(n_in_cohort)) %>%
  kable
cohort n_in_cohort
TARGET-10 337
10.1056/NEJMoa1403088 228
TARGET-20 137
EGAD00001003279 127
St Jude Cloud NOS 116
phs000673.v2.p1 89
TARGET-40 88
phs000720.v2.p1 84
10.1038/ng.3772 77
10.1038/ng.2938 75
10.1038/nature13109 74
TARGET-52 65
EGAD00001001098 63
phs000768.v2.p1 62
10.1038/ng.2611 52
10.1016/j.ccr.2013.11.002 43
EGAD00001001620 39
10.1038/ng.3709 36
phs000699.v1.p1 35
10.24370/SD_BHJXBDQK 31
EGAD00001001666 25
phs000900.v1.p1 24
EGAD00001000648 24
TARGET-21 23
EGAD00001000356 23
EGAD00001000617 23
10.1016/j.celrep.2014.03.003 23
EGAD00001001927 22
EGAD00001002680 19
SRP126664 19
EGAD00001000158 17
SRP092501 15
10.1016/j.ccr.2012.10.007 14
EGAD00001000826 10
10.1186/gb-2012-13-12-r113 8
TARGET-30 7
EGAD00001000328 6
10.1038/ng.3230 6
TARGET-50 4
SRP006575 4
SRP040454 3
counts_and_meta %>%
  select(cohort, n_in_cohort) %>%
  distinct %>%
  pull(n_in_cohort) %>%
  median
## [1] 25

Genders in dataset

table(counts_and_meta$gender, useNA = "always")
## 
##       female         male not reported      unknown         <NA> 
##          738         1010          309           82           38
# samples with reported genders
counts_and_meta %>% 
  filter(! is.na(gender)) %>%
  filter(! gender %in% c("not reported", "unknown")) %>%
  tabyl(gender) %>%
  adorn_totals
##  gender    n   percent
##  female  738 0.4221968
##    male 1010 0.5778032
##   Total 1748 1.0000000

Ages in dataset

table(counts_and_meta$pedaya, useNA = "always")
## 
##                  No Yes, age < 30 years                <NA> 
##                  51                2031                  95
table(subset(counts_and_meta, is.na(pedaya))$dataset_prefix)
## 
## TARGET  THR25  THR33  THR39 
##     73      1      2     19
# n_ped_aya <- sum(counts_and_meta$pedaya=="Yes, age < 30 years" | grepl("TARGET", counts_and_meta$dataset_prefix), na.rm = TRUE)
# at least one target sample is >30

n_ped_aya <- sum(counts_and_meta$pedaya=="Yes, age < 30 years" | counts_and_meta$age_at_dx<30, na.rm = TRUE)
n_ped_aya
## [1] 2102
n_ped_aya/nrow(counts_and_meta)
## [1] 0.9655489
# more than 95% of which are pediatric <30; of the Datasets with specific ages at diagnosis reported, the median was 7

median(counts_and_meta$age_at_dx, na.rm = TRUE)
## [1] 8.96
table(is.na(counts_and_meta$age_at_dx))
## 
## FALSE  TRUE 
##  1738   439

Read lengths in dataset

seq_len_round_25 <- round(counts_and_meta$seq_length / 25) * 25
table(seq_len_round_25, useNA = "always")
## seq_len_round_25
##   25   50   75  100  125 <NA> 
##    4  238  366 1542   27    0
tabyl(seq_len_round_25)
##  seq_len_round_25    n     percent
##                25    4 0.001837391
##                50  238 0.109324759
##                75  366 0.168121268
##               100 1542 0.708314194
##               125   27 0.012402389
table(is.na(counts_and_meta$seq_length))
## 
## FALSE 
##  2177
table(counts_and_meta$seq_length)
## 
##   36   50   51   75   76   81  100  101  110  125 
##    4   11  227  279   40   47   98 1429   15   27
summary(counts_and_meta$seq_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    36.0    76.0   101.0    91.5   101.0   125.0
IQR(counts_and_meta$seq_length, na.rm = TRUE)
## [1] 25
 old_read_length_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(seq_length, group=cohort), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Sequence length") +
    ylab("Datasets")
 
 read_length_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(seq_length), 
                 fill = "lightgrey", color = "black", stat = StatBin2,
                 binwidth = 1) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Sequence length") +
    ylab("Datasets") +
   expand_limits(x=0)
  
read_length_histogram

subset(counts_and_meta, Mapped > Total_reads)
## # A tibble: 0 x 23
## # Groups:   cohort [0]
## # … with 23 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## #   Total_reads <dbl>, Mapped <dbl>, seq_length <dbl>, disease <chr>,
## #   dataset_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## #   doi_link <chr>, source_accession <chr>, repo <chr>,
## #   original_dataset_id <chr>, genes_expressed_above_0 <dbl>,
## #   genes_expressed_above_1 <dbl>, genes_expressed_above_2 <dbl>,
## #   genes_expressed_above_3 <dbl>, genes_expressed_above_4 <dbl>,
## #   genes_expressed_above_5 <dbl>, cohort <chr>, n_in_cohort <int>
subset(counts_and_meta, MND > Mapped)
## # A tibble: 0 x 23
## # Groups:   cohort [0]
## # … with 23 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## #   Total_reads <dbl>, Mapped <dbl>, seq_length <dbl>, disease <chr>,
## #   dataset_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## #   doi_link <chr>, source_accession <chr>, repo <chr>,
## #   original_dataset_id <chr>, genes_expressed_above_0 <dbl>,
## #   genes_expressed_above_1 <dbl>, genes_expressed_above_2 <dbl>,
## #   genes_expressed_above_3 <dbl>, genes_expressed_above_4 <dbl>,
## #   genes_expressed_above_5 <dbl>, cohort <chr>, n_in_cohort <int>
subset(counts_and_meta, MEND > MND)
## # A tibble: 0 x 23
## # Groups:   cohort [0]
## # … with 23 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## #   Total_reads <dbl>, Mapped <dbl>, seq_length <dbl>, disease <chr>,
## #   dataset_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## #   doi_link <chr>, source_accession <chr>, repo <chr>,
## #   original_dataset_id <chr>, genes_expressed_above_0 <dbl>,
## #   genes_expressed_above_1 <dbl>, genes_expressed_above_2 <dbl>,
## #   genes_expressed_above_3 <dbl>, genes_expressed_above_4 <dbl>,
## #   genes_expressed_above_5 <dbl>, cohort <chr>, n_in_cohort <int>

Summarize total read counts

min(counts_and_meta$Total_reads)
## [1] 206916
Total_read_counts <- counts_and_meta %>% 
  ungroup %>%
  mutate(dataset_value = Total_reads/1e6) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(Total_read_counts %>%
        select(-variance, -p25, -p75), digits = 1)
min max median iqr
0.2 668 61.2 53.3
total_read_count_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(Total_reads/1E6), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Total reads") +
    ylab("Datasets")

total_read_count_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate percentages

 read_counts_with_read_type_fractions <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(
    pct_unmapped_of_total = (Total_reads - Mapped) / Total_reads,
    pct_dupe_of_mapped = (Mapped - MND) / Mapped,
    pct_non_exonic_of_non_dupe = (MND - MEND) / MND
  )

read_type_fractions_long <- read_counts_with_read_type_fractions %>%
  pivot_longer(cols = starts_with("pct_"), names_to = "read_type_fraction_name", values_to = "dataset_value") 

Summarize read type fractions

comparison_of_distributions <- read_type_fractions_long %>% 
  group_by(read_type_fraction_name) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))
## `summarise()` ungrouping output (override with `.groups` argument)
kable(comparison_of_distributions %>%
      mutate_if(is.double, ~100*.), digits = 3)
read_type_fraction_name variance min max median p25 p75 iqr
pct_dupe_of_mapped 5.884 2.853 99.997 26.904 13.453 43.072 29.618
pct_non_exonic_of_non_dupe 4.096 4.495 97.000 24.769 16.196 37.278 21.081
pct_unmapped_of_total 0.606 0.710 76.677 3.414 2.740 6.013 3.273
kable(comparison_of_distributions %>%
        select(-variance, -p25, -p75) %>%
      mutate_if(is.double, ~100*.), digits = 0)
read_type_fraction_name min max median iqr
pct_dupe_of_mapped 3 100 27 30
pct_non_exonic_of_non_dupe 4 97 25 21
pct_unmapped_of_total 1 77 3 3

statements about read type fractions

Overview

comparison_of_distributions
## # A tibble: 3 x 8
##   read_type_fraction_name    variance     min   max median    p25    p75    iqr
##   <chr>                         <dbl>   <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 pct_dupe_of_mapped          0.0588  0.0285  1.00  0.269  0.135  0.431  0.296 
## 2 pct_non_exonic_of_non_dupe  0.0410  0.0450  0.970 0.248  0.162  0.373  0.211 
## 3 pct_unmapped_of_total       0.00606 0.00710 0.767 0.0341 0.0274 0.0601 0.0327

unmapped

# in 77 datasets, more than 25% of reads are unmapped
table(read_counts_with_read_type_fractions$pct_unmapped_of_total>0.25)
## 
## FALSE  TRUE 
##  2100    77

duplicate

# 426 datasets have more than 50% duplicates (Fig 2A).
read_counts_with_read_type_fractions %>%
  mutate(above_50 = pct_dupe_of_mapped>0.50) %>%
  tabyl(above_50)
##  above_50    n   percent
##     FALSE 1751 0.8043179
##      TRUE  426 0.1956821
fiftypct <- read_counts_with_read_type_fractions %>%
  group_by(cohort) %>%
  summarize(has_a_dataset_with_more_than_50pct_dupes = any(pct_dupe_of_mapped>0.50),
            median_pct_dupe_of_mapped = median(pct_dupe_of_mapped)) %>%
  mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
         median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_dupes)
## `summarise()` ungrouping output (override with `.groups` argument)
fiftypct %>% tabyl(median_lt_50)
##  median_lt_50  n   percent
##         FALSE  7 0.1707317
##          TRUE 34 0.8292683
fiftypct %>% tabyl(has_a_dataset_with_more_than_50pct_dupes)
##  has_a_dataset_with_more_than_50pct_dupes  n   percent
##                                     FALSE  9 0.2195122
##                                      TRUE 32 0.7804878
fiftypct %>% tabyl(median_lt_50_and_has_a_dataset)
##  median_lt_50_and_has_a_dataset  n   percent
##                           FALSE 16 0.3902439
##                            TRUE 25 0.6097561

exonic

read_counts_with_read_type_fractions %>%
  mutate(above_50 = pct_non_exonic_of_non_dupe>0.50) %>%
  tabyl(above_50)
##  above_50    n   percent
##     FALSE 1847 0.8484153
##      TRUE  330 0.1515847
fiftypcte <- read_counts_with_read_type_fractions %>%
  group_by(cohort) %>%
  summarize(has_a_dataset_with_more_than_50pct_ne = any(pct_non_exonic_of_non_dupe>0.50),
            median_pct_dupe_of_mapped = median(pct_non_exonic_of_non_dupe)) %>%
  mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
         median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_ne)
## `summarise()` ungrouping output (override with `.groups` argument)
fiftypcte %>% tabyl(median_lt_50)
##  median_lt_50  n    percent
##         FALSE  3 0.07317073
##          TRUE 38 0.92682927
fiftypcte %>% tabyl(has_a_dataset_with_more_than_50pct_ne)
##  has_a_dataset_with_more_than_50pct_ne  n   percent
##                                  FALSE 28 0.6829268
##                                   TRUE 13 0.3170732
fiftypcte %>% tabyl(median_lt_50_and_has_a_dataset)
##  median_lt_50_and_has_a_dataset  n   percent
##                           FALSE 31 0.7560976
##                            TRUE 10 0.2439024

Correlation of depth of sequencing to duplicate read fraction

Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?

cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads, 
    read_counts_with_read_type_fractions$pct_dupe_of_mapped,
    method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34

Plot correlation of depth of sequencing to duplicate read fraction

this_data <- read_counts_with_read_type_fractions %>%
  select(Total_reads, pct_dupe_of_mapped)
## Adding missing grouping variables: `cohort`
this_plot_title <- paste("Correlation of Total_reads and % Duplicates")

these_stats <- this_data  %>%
  ungroup %>%
  summarize(r_corr = round(cor(Total_reads, pct_dupe_of_mapped), 2))

ggplot(this_data,
       aes(x=Total_reads/1E6, y=pct_dupe_of_mapped)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Total_reads/1E6),
             y=max(this_data$pct_dupe_of_mapped),
             hjust = 1, vjust = 1
             ) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  ggtitle(this_plot_title)  + 
  theme(legend.position="none", plot.title=element_text(size=22)) +
  xlab("Read counts (million)") +
  ylab("% Duplicates")
## `geom_smooth()` using formula 'y ~ x'

Generate schematic defining read types

colors_for_read_types <- these_colors_with_MEND
names(colors_for_read_types) <- gsub(" reads", "", names(these_colors_with_MEND))

colors_for_read_types <- c(colors_for_read_types, 
                           "Genome" = "darkblue",
                           "Exon" = "darkblue",
                           "Aligned reads" = "darkgrey")

plot1_colnames <- c("label", "x1", "x2", "y1", "y2")

read_type_schematic_data_as_vector <- c("Duplicate", 3, 4, 5, 6, 
                       "MEND", 3, 4, 4, 5, 
                       "MEND", 2.5, 3.5, 3, 4, 
                       "Non-exonic", 4.5, 5.5, 3, 4,
                       "Unmapped", 6, 7, 3, 4,
                       "Exon", 2.5, 4, 1, 2,
                       "Aligned reads", 2.8, 2.8, 6, 7)

rows_in_schematic <- length(read_type_schematic_data_as_vector)/length(plot1_colnames)

read_type_schematic_data <- matrix(read_type_schematic_data_as_vector, 
                     ncol = length(plot1_colnames), byrow = TRUE,
                     dimnames = list(c(1:rows_in_schematic), plot1_colnames)) %>%
  as_tibble %>%
  type_convert() %>%
  mutate(
    base_color = colors_for_read_types[match(label, names(colors_for_read_types))],
    border_color = case_when(
      label %in% c("Aligned reads", "Genome", "MEND", "Exon") ~ NA_character_,
      TRUE ~ base_color),
    fill_color = ifelse(label %in% c("MEND", "Exon"),  base_color, "white"),
    text_color = case_when(
      label %in% c("MEND", "Exon")~  "white",
      TRUE ~ base_color),
    mid_bar_x = map2_dbl(x1, x2,  function(x, y) mean(c(x,y))),
    mid_bar_y = map2_dbl(y1, y2,  function(x, y) mean(c(x,y)))
  )
## Parsed with column specification:
## cols(
##   label = col_character(),
##   x1 = col_double(),
##   x2 = col_double(),
##   y1 = col_double(),
##   y2 = col_double()
## )
padding_size = 0.35
end_of_box <- sort(read_type_schematic_data$x2,partial=length(read_type_schematic_data$x2)-1)[length(read_type_schematic_data$x2)-1] + padding_size
read_type_schematic <- ggplot(read_type_schematic_data, 
                              aes(xmin=x1, xmax=x2, ymin=y1, ymax = y2, 
                                  fill = fill_color, color = border_color)) +
  geom_rect() +
  
  geom_segment(x=min(read_type_schematic_data$x1-padding_size), y=1.5, xend = 5.75, yend = 1.5, color = "darkblue") +   
  geom_text(aes(label = label, x = mid_bar_x, y=mid_bar_y,
                color = text_color), 
            size = 4,
            fontface = "bold") +
  geom_rect(aes(xmin = min(x1 - padding_size), 
                ymin = 2.5, 
                xmax = end_of_box,
                ymax=max(y2 + padding_size)), 
            fill = NA, color = "darkgrey", size = 0.25) +
  scale_fill_identity() +
  scale_color_identity() +
  theme_void() + 
  theme(
    panel.grid.major.x = element_line(color="lightgrey", size=0.2),
    ) + 
 scale_x_continuous(
   breaks = seq(min(read_type_schematic_data$x1) - padding_size + 0.25, end_of_box , 0.25))

read_type_schematic

Generate schematic defining read type fractions

stat_names <- c("pct_unmapped_of_total",  "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_labels <- c("Unmapped",  "Duplicate", "Non-exonic") 
complement_stat_names <- c("Mapped reads",  "Non-duplicate reads", "Exonic reads") 
divisor_name = c("total", "mapped", "non-duplicate")

comparison_of_remaining <- read_type_fractions_long %>% 
  mutate(read_type_fraction_name = complement_stat_names[match(read_type_fraction_name, stat_names)]) %>%
  group_by(read_type_fraction_name) %>%
  summarize(min = round(1-max(dataset_value), 2),
    max = round(1-min(dataset_value), 2),
    median = round(1-median(dataset_value), 2),
    p25 = round(1-quantile(dataset_value, 0.25), 2),
    p75 = round(1-quantile(dataset_value, 0.75), 2)) %>%
      mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; x͂=",100* median, "%)"))
## `summarise()` ungrouping output (override with `.groups` argument)
positive_bars <- comparison_of_remaining %>%
  select(-min, -max, -p25, -p75) %>%
  mutate(read_type_fraction_label = read_type_fraction_name, 
         position = "1") %>%
  rename(abs_median = median)

negative_bars <- tibble(read_type_fraction_name = positive_bars$read_type_fraction_name,
                        read_type_fraction_label = stat_labels[match(read_type_fraction_name, complement_stat_names)],
                        bar_label = read_type_fraction_label,
                        abs_median = 1-positive_bars$abs_median,
                        position = "2")

total_bar <- tibble(read_type_fraction_name = "Total reads",
                    read_type_fraction_label = read_type_fraction_name,
                    bar_label = "Total reads\n(100% of reads)",
                    abs_median = 1,
                    position = "1"
                    )


MEND_stats_of_total <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(
    pct_MEND_of_total = (Total_reads - MEND) / Total_reads
  ) %>%
  ungroup %>% 
  summarize(min = round(1-max(pct_MEND_of_total), 2),
    max = round(1-min(pct_MEND_of_total), 2),
    median = round(1-median(pct_MEND_of_total), 2))

MEND_bar <- tibble(read_type_fraction_name = "MEND reads",
                    read_type_fraction_label = read_type_fraction_name,
                    bar_label = with(MEND_stats_of_total, paste0 ("MEND reads\n(", 100*min, "-", 100*max, "% of total reads; x͂=",100* median, "%)")),
                    abs_median = 1,
                    position = "1"
                    )

bar_names_in_order <- c("Total reads", "Mapped reads", "Non-duplicate reads", "Exonic reads", "MEND reads")
  
all_bars <- bind_rows(positive_bars, negative_bars, total_bar, MEND_bar) %>%
  mutate(read_type_fraction_name = factor(read_type_fraction_name, 
                                          levels = bar_names_in_order))

cat_space <- all_bars %>% filter(position == 1) %>%
  arrange(read_type_fraction_name) %>%
  select(read_type_fraction_name, abs_median) %>%
  ungroup %>%
  mutate(lagged1_abs_median = lag(abs_median, default =1),
         lagged2_abs_median = lag(abs_median, n=2, default =1),
         category_space = lagged1_abs_median*lagged2_abs_median
         ) %>%
  select(read_type_fraction_name, category_space)

these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")


all_bars_rel <- left_join(all_bars, cat_space, by = "read_type_fraction_name") %>%
  mutate(rel_median=ifelse(read_type_fraction_name == "MEND reads",
                           abs_median[read_type_fraction_label=="Exonic reads"]*
                             category_space[read_type_fraction_label=="Exonic reads"], 
                           abs_median*category_space),
         read_type_fraction_name = factor(read_type_fraction_name, levels = rev(bar_names_in_order)),
         position = factor(position, levels = rev(sort(unique(position)))),
         bar_color = these_colors_with_MEND[match(read_type_fraction_name, names(these_colors_with_MEND))],
         fill_val = ifelse(position == 1, bar_color, NA),
         border_color_val = bar_color,
         text_color = ifelse(position ==1, "white", "black"),
font_size = ifelse(abs_median<0.1, 3,3.5),
text_angle = ifelse(abs_median<0.1, 90,0)
         ) 
read_type_fractions_schematic <- ggplot(all_bars_rel, aes(y=read_type_fraction_name,
                x=rel_median, group = position)) + 
   geom_col(aes(fill = fill_val,
                color = border_color_val
                )) +
   geom_text(aes(label=bar_label,
                 color = text_color,
                 size = font_size,
                 angle= text_angle), 
             position = position_stack(vjust = .5),
              fontface = "bold") +
   scale_fill_identity() +
   scale_color_identity()  +
  scale_size_identity() + 
   theme_void() +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) 


ggplot(all_bars_rel, aes(y=read_type_fraction_name,
                x=rel_median, group = position)) + 
   geom_col(aes(fill = fill_val,
                color = border_color_val
                )) +
   geom_text(aes(label=read_type_fraction_label,
                 color = text_color,
                 size = font_size,
                 angle = text_angle), 
             position = position_stack(vjust = .5),
              fontface = "bold") +
   scale_fill_identity() +
   scale_color_identity()  +
  scale_size_identity() + 
   theme_void() +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

read_type_fractions_schematic

figure_name <- "f1"

right_side <- plot_grid(read_type_schematic,
read_type_fractions_schematic,
nrow=2,
rel_heights = c(1, 2),
labels = c("C", "D"))

left_side <- plot_grid(cohort_size_histogram,
read_length_histogram,
nrow=2,
labels = c("A", "B"))

plot_grid(left_side,
right_side,
ncol = 2) +
   draw_label(paste(figure_name, Sys.time()),
              x = 0, y = 0.02, hjust = 0, size = 6) 

stat_names <- c("pct_unmapped_of_total",  "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_label <- c("% unmapped", "% duplicate \n(of mapped)", "% non-exonic \n(of non-duplicate)")
names(stat_label) <- stat_names

Plot read fractions

colors_for_stat_names <- these_colors[c("Mapped reads", "Non-duplicate reads","Exonic reads")]
names(colors_for_stat_names) <- stat_names

these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
names(these_colors) <-  c("not assigned", "Total reads", "Non-exonic reads",  "Exonic reads", "Duplicate reads",  "Non-duplicate reads",  "Unmapped reads", "Mapped reads")

# 6 x 11 was a good ratio, but the text was too small

read_type_fractions_long_for_histogram <- read_type_fractions_long %>%
  mutate(read_type_fraction_name = factor(read_type_fraction_name,
                                          levels = stat_names))

read_type_fractions_histogram <-  ggplot(read_type_fractions_long_for_histogram) +
    geom_histogram(aes(x = dataset_value, color = read_type_fraction_name), 
                   fill = "white", stat = StatBin2)  +
  scale_color_manual(values = colors_for_stat_names) +
  facet_wrap(~read_type_fraction_name, 
             nrow = 1, 
             scales = "free_y",
             labeller = labeller(
               read_type_fraction_name = stat_label
               )
             ) +
  ylab("Datasets") +
  xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_histogram$dataset_id)),  " datasets")) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
    theme(strip.text.x = element_text(size = 12, face = "bold")) +
    theme(legend.position = "none")

read_type_fractions_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate MEND reads as a fraction of total

 read_counts_with_MEND_fractions <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(pct_MEND_of_total = MEND /Total_reads)

ggplot(read_counts_with_MEND_fractions) + 
  geom_histogram(aes(pct_MEND_of_total), stat = StatBin2) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

MEND_pct <- read_counts_with_MEND_fractions %>% 
  ungroup %>%
  mutate(dataset_value = pct_MEND_of_total) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(MEND_pct %>%
        select(-variance, -p25, -p75) %>%
      mutate_if(is.double, ~100*.), digits = 0)
min max median iqr
0 79 50 31

Show per-cohort distribution of read_type_fractions

figure_name <- "fracs_by_group"

read_type_fractions_long_for_boxplot <- read_type_fractions_long_for_histogram %>%
  ungroup %>%
  mutate(boxplot_label = fct_reorder(paste0(as.character(cohort), ", n=", n_in_cohort), n_in_cohort))


read_type_fractions_per_cohort_boxplot <- ggplot(read_type_fractions_long_for_boxplot) +
  geom_boxplot(aes(x = dataset_value, y=boxplot_label), outlier.size = 0.5)  +
  facet_wrap(~read_type_fraction_name, 
             nrow = 1, 
             labeller = labeller(
               read_type_fraction_name = stat_label
             )
  ) +
  ylab("Cohorts") +
  xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_boxplot$dataset_id)),  " datasets")) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  theme(strip.text.x = element_text(size = 12, face = "bold")) +
  theme(legend.position = "none")


read_type_fractions_per_cohort_boxplot

Characterize EGAD00001003279

Northcott, P.A., Buchhalter, I., Morrissy, A.S., Hovestadt, V., Weischenfeldt, J., Ehrenberger, T., Gröbner, S., Segura-Wang, M., Zichner, T., Rudneva, V.A., et al [Lichter]. (2017). The whole-genome landscape of medulloblastoma subtypes. Nature 547, 311–317.

two of the senior authors are from the German Cancer Research Center (DKFZ), one is Roland Eils German Cancer Research Center (DKFZ) Marco A. Marra from BC, Stefan M. Pfister German Cancer Research Center (DKFZ) Michael D. Taylor Sick kids Peter Lichter German Cancer Research Center (DKFZ)

RNA-Seq usage 1) CESAM integrates structural variant-derived breakpoints with RNA-seq data to identify expression changes associated with breakpoints in cis as described in previously37, by performing linear regression of expression (molecular phenotype) on structural variant-derived breakpoint (somatic genotype) data.

  1. Expression, e.g. d, t-SNE plots showing the relative, normalized expression intensities of GFI1, GFI1B, MYC, MYCN and PRDM6 in methylation subtypes (n = 219). e, Expression heat map showing transcriptional diversity among new MB subtypes (n = 248). (but could be from Affy)

Transcriptome data were acquired through RNA sequencing (RNA-seq; n = 164) and Affymetrix expression arrays (n = 392).

https://www.nature.com/articles/nature22973

read_type_names <- c("Total_reads", "Mapped",  "MND", "MEND")
this_cohort <- "EGAD00001003279"

in MS

 read_counts_with_read_type_fractions %>%
  filter(cohort == this_cohort) %>%
   mutate(gt_98 = pct_dupe_of_mapped > 0.98) %>%
   tabyl(gt_98) %>%
   add_totals_row
## Warning: 'add_totals_row' is deprecated.
## Use 'adorn_totals("row")' instead.
## See help("Deprecated")
##  gt_98   n   percent
##  FALSE  55 0.4330709
##   TRUE  72 0.5669291
##  Total 127 1.0000000
read_counts_with_read_type_fractions %>%
  filter(cohort == this_cohort) %>%
  filter(pct_dupe_of_mapped > 0.98) %>%
  pull(Total_reads) %>% min
## [1] 178294341

other

 counts_and_meta_long <- counts_and_meta  %>%
  pivot_longer(cols = c(Total_reads, Mapped, MEND, MND), names_to = "read_type_name", values_to = "dataset_value")  %>%
   ungroup %>%
   mutate(read_type_name = factor(read_type_name, levels = read_type_names))

counts_and_meta_long_one_proj <-  counts_and_meta_long %>%
  filter(cohort == this_cohort)

table(counts_and_meta_long_one_proj$read_type_name)
## 
## Total_reads      Mapped         MND        MEND 
##         127         127         127         127
# 78 datasets in the cohort have fewer than 0.2 million MEND reads. 
sum((counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND"))$dataset_value < 200000)
## [1] 78
those_datasets <- counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND", dataset_value < 200000) %>% pull(dataset_id)


summary((counts_and_meta_long_one_proj %>% filter(read_type_name=="Total_reads", dataset_id %in% those_datasets))$dataset_value)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 178294341 199916664 208667776 211606891 223832191 247618832
max_plots_to_include <- 10
n_datasets_to_plot <- ifelse(n_distinct(counts_and_meta_long_one_proj$dataset_id)>max_plots_to_include, 
                         max_plots_to_include, 
                         n_distinct(counts_and_meta_long_one_proj$dataset_id))
plot_word <- ifelse(n_datasets_to_plot==max_plots_to_include, max_plots_to_include, paste("all", n_distinct(counts_and_meta_long_one_proj$dataset_id)))

set.seed(100)
datasets_to_plot <- sample(unique(counts_and_meta_long_one_proj$dataset_id), n_datasets_to_plot)

ggplot(subset(counts_and_meta_long_one_proj, dataset_id %in% datasets_to_plot)) + 
  geom_col(aes(x=read_type_name, y=dataset_value/1e6, fill = read_type_name)) +
  facet_wrap(~dataset_id, ncol = 1, strip.position="right")  +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  theme(strip.text.y = element_text(angle = 0)) +
  scale_fill_brewer(palette = "Set1")  +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle(paste("Read counts for", plot_word, "datasets from", this_cohort)) +
  coord_flip()

ggplot(counts_and_meta_long_one_proj %>% filter(read_type_name == "MEND")) + 
  geom_histogram(aes(dataset_value/1e6), stat = StatBin2) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  ggtitle(paste("Distribution of MEND counts in", this_cohort))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

bl <- colorRampPalette(c("navy","royalblue","lightskyblue"))(200)                      
re <- colorRampPalette(c("mistyrose", "red2","darkred"))(200)


ggplot(counts_and_meta %>% 
         filter(cohort == this_cohort) %>%
         mutate(pct_dupes = (Mapped - MND)/Mapped)) + 
  geom_point(aes(x=MEND/1e6, y=pct_dupes, fill = Total_reads/1e6), shape=21, color = "black") +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  scale_fill_gradientn(colours = c(bl,"white", re)) +
  ggtitle(paste("Relationship of dupe fraction to MEND count in", this_cohort), "For detecting whether datasets with less info were sequenced more deeply. \nNormally, higher depth datasets have more MEND reads and more duplicate \nreads than the same dataset sequenced at a lower total depth.")

counts_and_meta_long_one_proj %>%
  select(seq_length, dataset_id) %>%
  distinct %>%
  tabyl(seq_length)
##  seq_length   n percent
##          51 127       1

Expressed genes

What is the distribution of measured genes?

gene_counts_long <- counts_and_meta %>%
  pivot_longer(cols = starts_with("genes_"), names_to = "Expression_threshold")  %>%
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  


ggplot(gene_counts_long) + 
  geom_histogram(aes(x=value/1e3, fill = Expression_threshold), stat = StatBin) +
  facet_wrap(~Expression_threshold)  +
    theme_minimal() +
  scale_fill_brewer(palette = "Set1") +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
#  theme_linedraw(base_size = 20) +
    theme(legend.position = "none") +
  # scale_fill_brewer(palette = "Set1") +
  ylab("Datasets") +
  xlab("Measured genes") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What are the exact lowest gene counts?

sort(counts_and_meta$genes_expressed_above_0) [1:100]
##   [1]     9     9    12    12    13    14    14    15    16    16    17    18
##  [13]    18    18    18    18    18    19    19    19    19    20    20    20
##  [25]    20    20    20    21    21    21    21    21    21    22    22    22
##  [37]    23    23    23    23    23    24    24    24    24    24    24    24
##  [49]    25    25    25    26    26    26    26    27    27    27    27    28
##  [61]    28    28    28    29    29    29    29    29    30    30    31    31
##  [73]    32    33    35    36    37    40  6098  7107 11830 15323 16793 16837
##  [85] 16943 17233 17531 18288 18588 18704 19626 19693 19749 19781 19942 19963
##  [97] 20063 20120 20203 20247

correlations with all datasets

library(corrr)


counts_and_meta %>% 
    ungroup %>% 
    select_if(is.double) %>%
       correlate() %>%
  filter(rowname %in% read_type_names) %>%
  rename(Read_type_count=rowname) %>% 
  mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
  arrange(Read_type_count) %>%
  select(c(Read_type_count, starts_with("genes_expressed"))) %>%
  kable(caption = "Correlations with all datasets", digits = 2)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all datasets
Read_type_count genes_expressed_above_0 genes_expressed_above_1 genes_expressed_above_2 genes_expressed_above_3 genes_expressed_above_4 genes_expressed_above_5
Total_reads -0.20 -0.40 -0.40 -0.40 -0.39 -0.39
Mapped -0.19 -0.41 -0.41 -0.41 -0.40 -0.39
MND 0.51 0.24 0.20 0.18 0.17 0.16
MEND 0.48 0.27 0.26 0.26 0.26 0.26

correlations with all but low gene count datasets

counts_and_meta %>% 
  filter(genes_expressed_above_0 > min_genes_gt_0) %>%
    ungroup %>% 
    select_if(is.double) %>%
       correlate() %>%
  filter(rowname %in% read_type_names) %>%
  rename(Read_type_count=rowname) %>% 
  mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
  arrange(Read_type_count) %>%
  select(c(Read_type_count, starts_with("genes_expressed"))) %>%
  kable(caption = "Correlations with all but low gene count datasets", digits = 2)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all but low gene count datasets
Read_type_count genes_expressed_above_0 genes_expressed_above_1 genes_expressed_above_2 genes_expressed_above_3 genes_expressed_above_4 genes_expressed_above_5
Total_reads 0.13 -0.26 -0.28 -0.28 -0.28 -0.28
Mapped 0.21 -0.25 -0.26 -0.27 -0.27 -0.27
MND 0.51 0.04 0.02 0.01 0.00 0.00
MEND 0.44 0.08 0.09 0.11 0.12 0.12

Plot correlations betweeen gene counts and read types

Groom data

expr_gene_and_read_counts_to_plot <- counts_and_meta %>%
  pivot_longer (cols = c(MND, MEND, Total_reads, Mapped), names_to = "Read_type", values_to = "Read_count") %>%
  pivot_longer (cols = starts_with("genes_expressed"), names_to = "Expression_threshold", values_to = "Gene_count") %>%
  mutate(Read_type = factor(Read_type, levels = read_type_names))

Identify datasets with unusual gene counts or read depths

datasets_with_normal_expressed_gene_numbers <- counts_and_meta %>%
  filter(genes_expressed_above_0 > min_genes_gt_0) %>%
  pull(dataset_id)

datasets_with_outlier_gene_counts <- counts_and_meta %>%
  ungroup %>%
  filter(is_outlier(genes_expressed_above_0)) %>%
  pull(dataset_id) 

datasets_with_outlier_depths <- counts_and_meta %>%
  ungroup %>%
  filter(is_outlier(Total_reads)) %>%
  pull(dataset_id) 

Function for calculating gene count correlations and plotting results

plot_gene_count_correlations <- function(this_data = expr_gene_and_read_counts_to_plot, this_plot_title = "gene count correlations"){

this_data <- this_data %>% 
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  
  
  
these_stats <- this_data  %>%
group_by(Read_type, Expression_threshold) %>%
  summarize(r_corr = round(cor(Gene_count, Read_count), 2)) 

# unique(this_data$Read_type)
# dput(unique(this_data$Read_type))
# these_colors_with_MEND

colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00", 
"MND"="#E31A1C", "MEND"="#000000")

ggplot(this_data,
       aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black") +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Read_count/1E6),
             y=max(this_data$Gene_count/1e3),
             hjust = 1, vjust = 1
             ) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  scale_color_manual(values = colors_for_correlation_plot) +
  facet_grid(Read_type~Expression_threshold) +
  ggtitle(this_plot_title)  + 
  theme(legend.position="none") +
  xlab("Read counts (million)") +
  ylab("Gene counts (thousand)")

}

for all datasets

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot,
                             "Correlations calculated across all datasets")
## `summarise()` regrouping output by 'Read_type' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(dataset_id %in% datasets_with_normal_expressed_gene_numbers),
  paste("Correlations in datasets with more than", min_genes_gt_0, "genes expressed above 0"))
## `summarise()` regrouping output by 'Read_type' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_gene_counts),
  paste("Correlations calculated across all but outlier gene count datasets"))
## `summarise()` regrouping output by 'Read_type' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_depths),
  paste("Correlations calculated across all but outlier read depth datasets"))
## `summarise()` regrouping output by 'Read_type' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_depths, 
         ! dataset_id %in% datasets_with_outlier_gene_counts
         ),
  paste("Correlations calculated across all but datasets with outlier read depth or gene count"))
## `summarise()` regrouping output by 'Read_type' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(
  expr_gene_and_read_counts_to_plot %>% 
    filter(disease ==  "acute lymphoblastic leukemia"),
  "Correlations calculated among acute lymphoblastic leukemia datasets")
## `summarise()` regrouping output by 'Read_type' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'

corr plot for figure

max_total_read_depth <- 250*1e6
print(min_genes_gt_0)
## [1] 100
sum(counts_and_meta$genes_expressed_above_0 <= min_genes_gt_0)
## [1] 78
sum(counts_and_meta$Total_reads > max_total_read_depth)
## [1] 105
# min_genes_gt_0
exclude_for_depth <- counts_and_meta %>%
  filter(Total_reads > max_total_read_depth) %>%
  pull(dataset_id)

exclude_for_gene_count <- counts_and_meta %>%
  filter(genes_expressed_above_0 < min_genes_gt_0) %>%
  pull(dataset_id)


this_data <- expr_gene_and_read_counts_to_plot %>% 
      filter(
        Expression_threshold %in% paste0("genes_expressed_above_", 1:3),
       ! dataset_id %in% exclude_for_depth, 
        ! dataset_id %in% exclude_for_gene_count
      ) %>%
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  

n_distinct(this_data$dataset_id)
## [1] 1994
this_plot_title <- paste("Correlations calculated across all but datasets with excessive read depth or low gene count")

these_stats <- this_data  %>%
group_by(Read_type, Expression_threshold) %>%
  summarize(r_corr = round(cor(Gene_count, Read_count), 2))
## `summarise()` regrouping output by 'Read_type' (override with `.groups` argument)
colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00", 
"MND"="#E31A1C", "MEND"="#000000")

cor_plots_excluding_high_read_depth_and_low_gene_count_datasets <- ggplot(this_data,
       aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black") +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Read_count/1E6),
             y=max(this_data$Gene_count/1e3),
             hjust = 1, vjust = 1
             ) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5),
        strip.text = element_text(face = "bold")) +
  scale_color_manual(values = colors_for_correlation_plot) +
  facet_grid(Read_type~Expression_threshold) +
  theme(legend.position="none") +
  scale_x_continuous("Read counts (million)", n.breaks = 4) +
  ylab("Gene counts (thousand)") 


cor_plots_excluding_high_read_depth_and_low_gene_count_datasets
## `geom_smooth()` using formula 'y ~ x'

# Figure 2

f2a <- read_type_fractions_histogram +
            theme(strip.text.x = element_text(size = 10, face = "bold"),
                  axis.title.x=element_blank(),
                  axis.text.x=element_blank())

f2b <- read_type_fractions_per_cohort_boxplot +
            theme(strip.text.x = element_blank(),
                  strip.background = element_blank(),
                  strip.text = element_blank())

f2ab <- plot_grid(f2a, 
                  f2b,
                  ncol = 1,
                  rel_heights = c(1.5, 6),
                  align = "v",
                  labels = "AUTO")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
f2c <- plot_grid(cor_plots_excluding_high_read_depth_and_low_gene_count_datasets,
          labels = "C")
## `geom_smooth()` using formula 'y ~ x'
f2 <- plot_grid(f2ab,
                 f2c,
                 nrow=1,
                 rel_widths = c(4,2.3))

figure_name <- "f2"
f2 +   draw_label(paste(figure_name, Sys.time()),
             x = 0, y = 0.02, hjust = 0, size = 6) 

Plot sequence length against pct duplicates

ggplot(read_counts_with_read_type_fractions) + 
  geom_point(aes(x=seq_length, y=pct_dupe_of_mapped))

ggplot(read_counts_with_read_type_fractions) + 
  geom_boxplot(aes(x=seq_length, y=pct_dupe_of_mapped, group = seq_length))

Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?

cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads, 
    read_counts_with_read_type_fractions$pct_dupe_of_mapped,
    method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34

SessionInfo

pander(sessionInfo(), compact = FALSE)

R version 4.0.0 (2020-04-24)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=C.UTF-8, LC_NUMERIC=C, LC_TIME=C.UTF-8, LC_COLLATE=C.UTF-8, LC_MONETARY=C.UTF-8, LC_MESSAGES=C.UTF-8, LC_PAPER=C.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=C.UTF-8 and LC_IDENTIFICATION=C

attached base packages:

  • stats
  • graphics
  • grDevices
  • utils
  • datasets
  • methods
  • base

other attached packages:

  • corrr(v.0.4.2)
  • kableExtra(v.1.1.0)
  • pander(v.0.6.3)
  • RColorBrewer(v.1.1-2)
  • cowplot(v.1.0.0)
  • ggpubr(v.0.4.0)
  • ggthemes(v.4.2.0)
  • knitr(v.1.29)
  • viridis(v.0.5.1)
  • viridisLite(v.0.3.0)
  • janitor(v.1.2.0)
  • forcats(v.0.5.0)
  • stringr(v.1.4.0)
  • dplyr(v.1.0.0)
  • purrr(v.0.3.4)
  • readr(v.1.3.1)
  • tidyr(v.1.1.0)
  • tibble(v.3.0.1)
  • ggplot2(v.3.3.0)
  • tidyverse(v.1.3.0)

loaded via a namespace (and not attached):

  • httr(v.1.4.1)
  • splines(v.4.0.0)
  • jsonlite(v.1.7.0)
  • carData(v.3.0-4)
  • modelr(v.0.1.8)
  • assertthat(v.0.2.1)
  • highr(v.0.8)
  • blob(v.1.2.1)
  • cellranger(v.1.1.0)
  • yaml(v.2.2.1)
  • pillar(v.1.4.4)
  • backports(v.1.1.8)
  • lattice(v.0.20-41)
  • glue(v.1.4.1)
  • digest(v.0.6.25)
  • ggsignif(v.0.6.0)
  • rvest(v.0.3.5)
  • colorspace(v.1.4-1)
  • Matrix(v.1.2-18)
  • htmltools(v.0.5.0)
  • pkgconfig(v.2.0.3)
  • broom(v.0.5.6)
  • haven(v.2.3.1)
  • scales(v.1.1.1)
  • webshot(v.0.5.2)
  • openxlsx(v.4.1.5)
  • rio(v.0.5.16)
  • mgcv(v.1.8-31)
  • farver(v.2.0.3)
  • generics(v.0.0.2)
  • car(v.3.0-8)
  • ellipsis(v.0.3.1)
  • withr(v.2.2.0)
  • cli(v.2.0.2)
  • magrittr(v.1.5)
  • crayon(v.1.3.4)
  • readxl(v.1.3.1)
  • evaluate(v.0.14)
  • fs(v.1.4.2)
  • fansi(v.0.4.1)
  • nlme(v.3.1-147)
  • rstatix(v.0.6.0)
  • xml2(v.1.3.2)
  • foreign(v.0.8-78)
  • tools(v.4.0.0)
  • data.table(v.1.12.8)
  • hms(v.0.5.3)
  • lifecycle(v.0.2.0)
  • munsell(v.0.5.0)
  • reprex(v.0.3.0)
  • zip(v.2.0.4)
  • compiler(v.4.0.0)
  • rlang(v.0.4.6)
  • grid(v.4.0.0)
  • rstudioapi(v.0.11)
  • labeling(v.0.3)
  • rmarkdown(v.2.3)
  • gtable(v.0.3.0)
  • abind(v.1.4-5)
  • DBI(v.1.1.0)
  • curl(v.4.3)
  • R6(v.2.4.1)
  • gridExtra(v.2.3)
  • lubridate(v.1.7.9)
  • utf8(v.1.1.4)
  • stringi(v.1.4.6)
  • Rcpp(v.1.0.4.6)
  • vctrs(v.0.3.1)
  • dbplyr(v.1.4.4)
  • tidyselect(v.1.1.0)
  • xfun(v.0.15)